library(dplyr)
library(readr)
library(DataExplorer)
library(cluster)
library(factoextra)
library(FactoMineR)
library(janitor)
library(ggplot2)
library(tidyr)
library(reshape2)
library(stringr)
library(scales)
library(plotly)

knitr::opts_chunk$set(echo = TRUE)
toxins_2023 <- read_csv("C:/Users/wanel/OneDrive/Documents/GitHub/STAT-442-Final-Project/Datasets/2023_us.csv")
## Rows: 77964 Columns: 122
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (31): 2. TRIFD, 4. FACILITY NAME, 5. STREET ADDRESS, 6. CITY, 7. COUNTY,...
## dbl (84): 1. YEAR, 3. FRS ID, 10. BIA, 12. LATITUDE, 13. LONGITUDE, 22. INDU...
## lgl  (7): 24. PRIMARY SIC, 25. SIC 2, 26. SIC 3, 27. SIC 4, 28. SIC 5, 29. S...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
toxins_clean <- toxins_2023 %>%
  select(-c(24:29)) %>%
  mutate(`45. METAL CATEGORY` = ifelse(`45. METAL CATEGORY` == "Metal complound categories", "Metal compound categories",`45. METAL CATEGORY` )) 
# Data preparation for on-site
toxins_clustered_on_site <- toxins_2023 %>%
  select(c(51:53, 55, 56, 58:60, 62:65)) %>%  # Select specified columns
  na.omit() %>%                              # Remove rows with missing values
  filter(.[[ncol(.)]] != 0)                  # Remove rows where column 65 has a value of 0

# Scaling the data
toxins_clustered_on_site_scaled <- scale(toxins_clustered_on_site)

# Data preparation for off-site
toxins_clustered_off_site <- toxins_2023 %>%
  select(c(66:71, 75, 76, 79:88)) %>%  # Select specified columns
  na.omit() %>%                        # Remove rows with missing values
  filter(.[[ncol(.)]] != 0)            # Remove rows where column 88 has a value of 0



# Scaling the data
toxins_clustered_off_site_scaled <- scale(toxins_clustered_off_site)

sample_size <- 10000

# For on-site data
random_sample_on_site <- toxins_clustered_on_site_scaled[sample(1:nrow(toxins_clustered_on_site_scaled), sample_size), ]

# For off-site data
random_sample_off_site <- toxins_clustered_off_site_scaled[sample(1:nrow(toxins_clustered_off_site_scaled), sample_size), ]

# Step 2: Visualize the elbow method using fviz_nbclust (within-cluster sum of squares)
# On-site data
fviz_nbclust(random_sample_on_site, kmeans, method = "silhouette") +
  ggtitle("Silhouette Method for On-site Data")

# Off-site data
fviz_nbclust(random_sample_off_site, kmeans, method = "silhouette") +
  ggtitle("Silhouette Method for Off-site Data")

# Clustering with 2 clusters for on-site
kmeans_on_site <- kmeans(toxins_clustered_on_site_scaled, centers = 2)
kmeans_on_site$centers
##   51. 5.1 - FUGITIVE AIR 52. 5.2 - STACK AIR 53. 5.3 - WATER
## 1           1.263709e-01       -8.520503e-02   -3.475093e-02
## 2          -6.806698e-06        4.589387e-06    1.871785e-06
##   55. 5.4.1 - UNDERGROUND CL I 56. 5.4.2 - UNDERGROUND C II-V
## 1                -2.606665e-02                  -8.729126e-03
## 2                 1.404024e-06                   4.701757e-07
##   58. 5.5.1A - RCRA C LANDFILL 59. 5.5.1B - OTHER LANDFILLS
## 1                -3.235243e-02                -4.222658e-02
## 2                 1.742595e-06                 2.274444e-06
##   60. 5.5.2 - LAND TREATMENT 62. 5.5.3A - RCRA SURFACE IM
## 1              -1.793947e-02                -4.484244e-03
## 2               9.662714e-07                 2.415342e-07
##   63. 5.5.3B - OTHER SURFACE I 64. 5.5.4 - OTHER DISPOSAL
## 1                105.530269207              129.879566229
## 2                 -0.005684163               -0.006995686
##   65. ON-SITE RELEASE TOTAL
## 1             125.908039405
## 2              -0.006781768
kmeans_on_site$size
## [1]     3 55697
kmeans_on_site$tot.withinss
## [1] 536806.3
# Display the ratio of between_SS to total_SS
between_tot_ratio_on <- kmeans_on_site$betweenss / kmeans_on_site$totss * 100
cat("Between_SS / Total_SS =", round(between_tot_ratio_on, 1), "%\n")
## Between_SS / Total_SS = 19.7 %
fviz_cluster(kmeans_on_site, data = toxins_clustered_on_site_scaled, 
             geom = "point", ellipse.type = "norm", 
             ggtheme = theme_minimal())
## Too few points to calculate an ellipse

# Clustering with 6 clusters for off-site
kmeans_off_site <- kmeans(toxins_clustered_off_site_scaled, centers = 2)
kmeans_off_site$centers
##   66. 6.1 - POTW - TRNS RLSE 67. 6.1 - POTW - TRNS TRT
## 1                 8.71454390               10.68526884
## 2                -0.01672721               -0.02050994
##   68. POTW - TOTAL TRANSFERS 69. 6.2 - M10 70. 6.2 - M41 71. 6.2 - M62
## 1                11.23359393 -4.296281e-02   3.616476185   -0.02662729
## 2                -0.02156243  8.246536e-05  -0.006941679    0.00005111
##   75. 6.2 - M81 76. 6.2 - M82 79. 6.2 - M66 80. 6.2 - M67 81. 6.2 - M64
## 1   2.478007849     5.3915696 -7.387454e-03 -1.828471e-02   11.78490981
## 2  -0.004756436    -0.0103489  1.417992e-05  3.509676e-05   -0.02262066
##   82. 6.2 - M65 83. 6.2 - M73 84. 6.2 - M79 85. 6.2 - M90 86. 6.2 - M94
## 1   4.142540154 -2.391045e-02 -3.897248e-02   4.269182354 -3.737444e-02
## 2  -0.007951438  4.589514e-05  7.480609e-05  -0.008194523  7.173872e-05
##   87. 6.2 - M99 88. OFF-SITE RELEASE TOTAL
## 1 -4.438416e-02                14.73970989
## 2  8.519359e-05                -0.02829228
kmeans_off_site$size
## [1]    50 26049
kmeans_off_site$tot.withinss
## [1] 431884.5
# Display the ratio of between_SS to total_SS
between_tot_ratio_off <- kmeans_off_site$betweenss / kmeans_off_site$totss * 100
cat("Between_SS / Total_SS =", round(between_tot_ratio_off, 1), "%\n")
## Between_SS / Total_SS = 8.1 %
fviz_cluster(kmeans_off_site, data = toxins_clustered_off_site_scaled, 
             geom = "point", ellipse.type = "norm", 
             ggtheme = theme_minimal())

# Data preparation for total
toxins_clustered_total <- toxins_2023 %>%
  select(c(65, 88, 94, 113, 116, 119, 107, 106, 120)) %>%
  na.omit()

# Scaling the data
toxins_clustered_total_scaled <- scale(toxins_clustered_total)

# For total data
random_sample_total <- toxins_clustered_total_scaled[sample(1:nrow(toxins_clustered_total_scaled), sample_size), ]

# Step 2: Visualize the elbow method using fviz_nbclust (within-cluster sum of squares)
# On-site data
fviz_nbclust(random_sample_total, kmeans, method = "silhouette") +
  ggtitle("Silhouette Method for Total Data")

# Clustering with 2 clusters for on-site
kmeans_total <- kmeans(toxins_clustered_total_scaled, centers = 2)
kmeans_total$centers
##   65. ON-SITE RELEASE TOTAL 88. OFF-SITE RELEASE TOTAL
## 1               -0.01578838                -0.01207179
## 2               10.00797559                 7.65209425
##   94. OFF-SITE RECYCLED TOTAL 113. 8.2 - ENERGY RECOVER ON
## 1                 -0.01913677                  -0.01362149
## 2                 12.13046288                   8.63442099
##   116. 8.5 - RECYCLING OFF SIT 119. PRODUCTION WSTE (8.1-8.7)
## 1                  -0.01919015                    -0.02149019
## 2                  12.16429663                    13.62225482
##   107. TOTAL RELEASES 106. 6.2 - TOTAL TRANSFER 120. 8.8 - ONE-TIME RELEASE
## 1         -0.01640607               -0.01579178                 -0.01081905
## 2         10.39951763               10.01012788                  6.85800245
kmeans_total$size
## [1] 10776    17
kmeans_total$tot.withinss
## [1] 80623.29
# Display the ratio of between_SS to total_SS
between_tot_ratio_total <- kmeans_total$betweenss / kmeans_total$totss * 100
cat("Between_SS / Total_SS =", round(between_tot_ratio_total, 1), "%\n")
## Between_SS / Total_SS = 17 %
fviz_cluster(kmeans_total, data = toxins_clustered_total_scaled, 
             geom = "point", ellipse.type = "norm", 
             ggtheme = theme_minimal())

## PCA

# Perform PCA on on-site scaled data
pca_on_site <- prcomp(toxins_clustered_on_site_scaled, center = TRUE, scale. = TRUE)

# Summary of PCA to see variance explained by each principal component
summary(pca_on_site)
## Importance of components:
##                           PC1     PC2     PC3     PC4    PC5     PC6     PC7
## Standard deviation     1.6531 1.05258 1.01878 1.00567 1.0010 1.00010 0.99999
## Proportion of Variance 0.2277 0.09233 0.08649 0.08428 0.0835 0.08335 0.08333
## Cumulative Proportion  0.2277 0.32005 0.40655 0.49083 0.5743 0.65768 0.74101
##                            PC8     PC9    PC10    PC11      PC12
## Standard deviation     0.99879 0.98111 0.94517 0.50430 2.959e-11
## Proportion of Variance 0.08313 0.08021 0.07445 0.02119 0.000e+00
## Cumulative Proportion  0.82415 0.90436 0.97881 1.00000 1.000e+00
# Visualize the PCA
# Scree plot to show the variance explained by each principal component
fviz_eig(pca_on_site)

# Alternatively, you can plot the contributions of each variable to the first two principal components
fviz_pca_var(pca_on_site, 
             col.var = "contrib", # color based on contributions of variables
             gradient.cols = c("blue", "green", "red"), 
             repel = TRUE) # repel text to avoid overlap

# Perform PCA on off-site scaled data
pca_off_site <- prcomp(toxins_clustered_off_site_scaled, center = TRUE, scale. = TRUE)

# Summary of PCA to see variance explained by each principal component
summary(pca_off_site)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     1.5712 1.4361 1.11480 1.00952 1.00709 1.00164 1.00055
## Proportion of Variance 0.1371 0.1146 0.06904 0.05662 0.05635 0.05574 0.05562
## Cumulative Proportion  0.1371 0.2517 0.32077 0.37738 0.43373 0.48947 0.54509
##                            PC8     PC9    PC10   PC11    PC12    PC13    PC14
## Standard deviation     1.00017 1.00006 1.00004 0.9986 0.99842 0.99203 0.99092
## Proportion of Variance 0.05557 0.05556 0.05556 0.0554 0.05538 0.05467 0.05455
## Cumulative Proportion  0.60066 0.65622 0.71178 0.7672 0.82257 0.87724 0.93179
##                           PC15    PC16      PC17      PC18
## Standard deviation     0.78911 0.77785 3.108e-10 1.812e-10
## Proportion of Variance 0.03459 0.03361 0.000e+00 0.000e+00
## Cumulative Proportion  0.96639 1.00000 1.000e+00 1.000e+00
# Visualize the PCA
# Scree plot to show the variance explained by each principal component
fviz_eig(pca_off_site)

# Alternatively, you can plot the contributions of each variable to the first two principal components
fviz_pca_var(pca_off_site, 
             col.var = "contrib", # color based on contributions of variables
             gradient.cols = c("blue", "green", "red"), 
             repel = TRUE) # repel text to avoid overlap

# Perform PCA on off-site scaled data
pca_total <- prcomp(toxins_clustered_total_scaled, center = TRUE, scale. = TRUE)

# Summary of PCA to see variance explained by each principal component
summary(pca_total)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6     PC7
## Standard deviation     1.5130 1.4381 1.3773 1.1816 0.9213 0.61822 0.34114
## Proportion of Variance 0.2544 0.2298 0.2108 0.1551 0.0943 0.04247 0.01293
## Cumulative Proportion  0.2544 0.4841 0.6949 0.8500 0.9443 0.98678 0.99971
##                            PC8     PC9
## Standard deviation     0.05079 1.9e-11
## Proportion of Variance 0.00029 0.0e+00
## Cumulative Proportion  1.00000 1.0e+00
# Visualize the PCA
# Scree plot to show the variance explained by each principal component
fviz_eig(pca_total)

# Alternatively, you can plot the contributions of each variable to the first two principal components
fviz_pca_var(pca_total, 
             col.var = "contrib", # color based on contributions of variables
             gradient.cols = c("blue", "green", "red"), 
             repel = TRUE) # repel text to avoid overlap

# Step 1: Extract the first 2 PCs for on-site and off-site data
pca_scores_on_site <- pca_on_site$x[, 1:2]  # First 2 PCs for On-Site
pca_scores_off_site <- pca_off_site$x[, 1:2]  # First 2 PCs for Off-Site
pca_score_total <- pca_total$x[,1:2]

# Step 2: Combine PCA scores with the original data
pca_on_site <- cbind(toxins_clustered_on_site, pca_scores_on_site)
pca_off_site <- cbind(toxins_clustered_off_site, pca_scores_off_site)
pca_total <- cbind(toxins_clustered_total, pca_score_total)

# Step 3: Scale PCA scores and the original data
pca_on_site_scaled <- scale(pca_on_site[, (ncol(toxins_clustered_on_site) + 1):(ncol(toxins_clustered_on_site) + 2)])  # Scale PCA scores
pca_off_site_scaled <- scale(pca_off_site[, (ncol(toxins_clustered_off_site) + 1):(ncol(toxins_clustered_off_site) + 2)])  # Scale PCA scores
pca_total_scaled <- scale(pca_total[, (ncol(toxins_clustered_total) + 1):(ncol(toxins_clustered_total) + 2)])


# Step 4: Run k-means clustering for On-Site data (assume 6 clusters for demonstration)
k_on_site <- kmeans(pca_on_site[, ncol(pca_on_site)-1:ncol(pca_on_site)], centers = 2, nstart = 25)




# Step 5: Calculate the proportion of variance explained by the clustering
cat("Proportion of variance explained (On-Site):", k_on_site$betweenss / k_on_site$totss, "\n")
## Proportion of variance explained (On-Site): 0.8206545
# Step 6: Visualize clusters for On-Site data
fviz_cluster(list(data = pca_on_site[, ncol(pca_on_site)-1:ncol(pca_on_site)], cluster = k_on_site$cluster),
             geom = "point",
             ellipse.type = "convex",
             show.clust.cent = TRUE,
             main = "Clusters from PCA (On-Site Data)")

random_sample_on_site_pca <- pca_on_site_scaled[sample(1:nrow(pca_on_site_scaled), sample_size), ]

# For off-site data
random_sample_off_site_pca <- pca_off_site_scaled[sample(1:nrow(pca_off_site_scaled), sample_size), ]

random_sample_total_pca <- pca_total_scaled[sample(1:nrow(pca_total_scaled), sample_size), ]

# Step 2: Visualize the elbow method using fviz_nbclust (within-cluster sum of squares)
# On-site data
fviz_nbclust(random_sample_on_site_pca, kmeans, method = "silhouette") +
  ggtitle("Silhouette Method for On-site Data")

# Off-site data
fviz_nbclust(random_sample_off_site_pca, kmeans, method = "silhouette") +
  ggtitle("Silhouette Method for Off-site Data")

# Step 7: Run k-means clustering for Off-Site data (assume 6 clusters for demonstration)
k_off_site <- kmeans(pca_off_site[, ncol(pca_off_site)-1:ncol(pca_off_site)], centers = 2, nstart = 25)

# Step 8: Calculate the proportion of variance explained by the clustering
cat("Proportion of variance explained (Off-Site):", k_off_site$betweenss / k_off_site$totss, "\n")
## Proportion of variance explained (Off-Site): 0.2787647
# Step 9: Visualize clusters for Off-Site data
fviz_cluster(list(data = pca_off_site[, ncol(pca_off_site)-1:ncol(pca_off_site)], cluster = k_off_site$cluster),
            geom = "point",
             ellipse.type = "convex",
             show.clust.cent = TRUE,
             main = "Clusters from PCA (Off-Site Data)")

# Total data
fviz_nbclust(random_sample_total_pca, kmeans, method = "silhouette") +
  ggtitle("Silhouette Method for Total Data")

# Step 7: Run k-means clustering for Off-Site data (assume 6 clusters for demonstration)
k_total <- kmeans(pca_total[, ncol(pca_total)-1:ncol(pca_total)], centers = 2, nstart = 25)

# Step 8: Calculate the proportion of variance explained by the clustering
cat("Proportion of variance explained (Total):", k_total$betweenss / k_total$totss, "\n")
## Proportion of variance explained (Total): 0.5246632
# Step 9: Visualize clusters for Off-Site data
fviz_cluster(list(data = pca_total[, ncol(pca_total)-1:ncol(pca_total)], cluster = k_total$cluster),
            geom = "point",
             ellipse.type = "convex",
             show.clust.cent = TRUE,
             main = "Clusters from PCA (Total Data)")

Totals are very strong predictors.

MFA

# Step 1: Load necessary libraries
library(FactoMineR)
library(factoextra)

# Step 2: Prepare the data
mfa_toxins <- toxins_2023

# Step 3: Select variables for MFA (replace indices with appropriate column numbers)
MFAData <- toxins_2023[, c(37, 23, 46, 8, 44, 65, 88, 94, 106, 107, 119)]

# Step 4: Convert categorical variables to factors and numerical to numeric
# Adjust column indices based on the dataset
categorical_cols <- c(1, 2, 3, 4, 5)  # Example indices for categorical variables
numerical_cols <- c(6, 7, 8, 9, 10, 11)  # Example indices for numerical variables

MFAData[categorical_cols] <- lapply(MFAData[categorical_cols], as.factor)
MFAData[numerical_cols] <- lapply(MFAData[numerical_cols], as.numeric)

# Step 5: Remove rows with missing values
MFAData <- na.omit(MFAData)

# Step 6: Random sampling if dataset is too large
sample_size <- 1000  # Define the desired sample size
if (nrow(MFAData) > sample_size) {
  set.seed(123)  # Ensure reproducibility
  MFAData <- MFAData[sample(1:nrow(MFAData), sample_size), ]
}

# Step 7: Define groups and types for each variable
group_sizes <- rep(1, ncol(MFAData))  # Each variable is its own group
types <- c(rep("n", length(categorical_cols)), rep("s", length(numerical_cols)))  # Types for each variable

# Step 8: Run MFA
MFA1 <- MFA(
  MFAData,
  group = group_sizes,  # Each variable treated as a separate group
  type = types,         # Specify type for each variable
  name.group = colnames(MFAData),  # Use column names for group names
  graph = FALSE         # Do not automatically plot graphs
)

# Step 9: Visualizations

# Scree plot: variance explained by dimensions
fviz_screeplot(MFA1, addlabels = TRUE, ylim = c(0, 50))

# Variable contributions to the dimensions
fviz_mfa_var(MFA1,
             repel = TRUE,  # Avoid label overlap
             col.var = "cos2",  # Color by quality of representation (cos2 indicates the quality)
             gradient.cols = c("blue", "green", "red"))

# Individuals (data points) representation
fviz_mfa_ind(MFA1, 
             repel = TRUE,  # Avoid label overlap
             geom = "point", 
             col.ind = "cos2",  # Color by quality of representation (cos2 indicates the quality)
             gradient.cols = c("blue", "green", "red"))

# Variable contributions to the dimensions (by group)
fviz_mfa_var(MFA1, "group",
             repel = TRUE,  # Avoid label overlap
             col.var = "cos2",  # Color by quality of representation (cos2 indicates the quality)
             gradient.cols = c("blue", "green", "red"))

# Quantitative variables
fviz_mfa_var(MFA1, 
             choice = "quanti.var",  # For quantitative variables
             repel = TRUE,           # Avoid label overlap
             col.var = "cos2",       # Color by quality of representation (cos2 indicates the quality)
             gradient.cols = c("blue", "green", "red"))

# Load required libraries
library(dplyr)
library(janitor)
library(ggplot2)
library(tidyr)
library(reshape2)
library(stringr)
library(scales)

# Clean all column names and address multiple patterns like 'x#_', '5_5_1_', and '6_2_m79'
toxins_named <- toxins_2023 %>%
  clean_names() %>%
  rename_with(~ gsub("^x\\d+_", "", .), everything()) %>%  # Remove 'x#_' prefixes
  rename_with(~ gsub("^\\d+(_\\d+)*_", "", .), everything())  # Remove leading numeric patterns like '5_5_1_' or '6_2_'



toxins_named <- toxins_named %>%
  mutate(chemical = case_when(
    str_detect(chemical, "Chromium  and Chromium Compounds") ~ "Chromium and Chromium Compounds",
    str_detect(chemical, "Sulfuric acid") ~ "Sulfuric acid",
    str_detect(chemical, "Nitrate compounds") ~ "Nitrate compounds",
    str_detect(chemical, "Barium compounds") ~ "Barium compounds (except for barium sulfate)",
    TRUE ~ chemical
  ))

# Now proceed with your summary and plotting code
chem_release_summary_haz <- toxins_named %>%
  filter(clean_air_act_chemical == "YES") %>%
  group_by(chemical) %>%
  summarise(total_releases = sum(total_releases, na.rm = TRUE)) %>%
  arrange(desc(total_releases)) %>%
  head(10)



# Plot for Clean Air Act Chemical (Horizontal Bar Plot) - Top 10
p1 <- ggplot(chem_release_summary_haz, aes(x = reorder(chemical, total_releases), y = total_releases, fill = chemical)) +
  geom_bar(stat = "identity", width = 0.7, show.legend = FALSE) +
  coord_flip() +
  labs(title = "Top 10 Toxins Released",
       y = "Total Release",
       x = "Chemical") +
  theme_minimal() +
  theme(axis.text.y = element_text(angle = 0, hjust = 1),
        plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(labels = scales::comma)

# Now proceed with your summary and plotting code
chem_release_summary_non <- toxins_named %>%
  filter(clean_air_act_chemical == "NO") %>%
  group_by(chemical) %>%
  summarise(total_releases = sum(total_releases, na.rm = TRUE)) %>%
  arrange(desc(total_releases)) %>%
  head(10)

# Plot for Clean Air Act Chemical (Horizontal Bar Plot) - Top 10
p2 <- ggplot(chem_release_summary_non, aes(x = reorder(chemical, total_releases), y = total_releases, fill = chemical)) +
  geom_bar(stat = "identity", width = 0.7, show.legend = FALSE) +
  coord_flip() +
  labs(title = "Top 10 Toxins (Non Hazardous) Released",
       y = "Total Release",
       x = "Chemical") +
  theme_minimal() +
  theme(axis.text.y = element_text(angle = 0, hjust = 1),
        plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(labels = scales::comma)


# Summarize release amounts by classification
classification_summary <- toxins_named %>%
  group_by(classification) %>%
  summarise(total_release = sum(total_releases, na.rm = TRUE)) %>%
  arrange(desc(total_release))

# Plot for Classification (Top 10)
p3 <- classification_summary %>%
  top_n(10, total_release) %>%
  ggplot(aes(x = reorder(classification, total_release), y = total_release, fill = classification)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Top 10 Toxins Released by Classification",
       y = "Total Release",
       x = "Classification") +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(labels = comma)

# Pivot data to gather different release methods
release_methods <- toxins_named %>%
  select(chemical, fugitive_air, stack_air, water, 
         underground_cl_i, underground_c_ii_v, 
         `1a_rcra_c_landfill`, `1b_other_landfills`, 
         other_disposal) %>%
  pivot_longer(cols = fugitive_air:other_disposal, 
               names_to = "release_method", 
               values_to = "amount") %>%
  group_by(release_method) %>%
  summarise(total_amount = sum(amount, na.rm = TRUE))

# Rename release methods for better readability
release_methods <- release_methods %>%
  mutate(release_method = case_when(
    release_method == "fugitive_air" ~ "Fugitive Air",
    release_method == "stack_air" ~ "Stack Air",
    release_method == "water" ~ "Water",
    release_method == "underground_cl_i" ~ "Underground Class I",
    release_method == "underground_c_ii_v" ~ "Underground Class II-V",
    release_method == "1a_rcra_c_landfill" ~ "RCRA C Landfill",
    release_method == "1b_other_landfills" ~ "Other Landfills",
    release_method == "other_disposal" ~ "Other Disposal",
    TRUE ~ release_method
  ))

# Plot for Different Release Methods
p4 <- ggplot(release_methods, aes(x = reorder(release_method, total_amount), y = total_amount, fill = release_method)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Amounts Released by Different Onsite Methods",
       x = "Release Method",
       y = "Total Amount") +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(labels = comma) +
  scale_fill_brewer(palette = "Set3")

# List of offsite release methods (excluding total)
offsite_methods <- c("m10", "m41", "m62", "m81", "m82", 
                    "m66", "m67", "m64", "m65", "m73", "m79", 
                     "m90", "m94", "m99")

# Pivot data to gather different offsite release methods
offsite_release_methods <- toxins_named %>%
  select(all_of(offsite_methods)) %>%
  pivot_longer(cols = everything(), 
               names_to = "release_method", 
               values_to = "amount") %>%
  group_by(release_method) %>%
  summarise(total_amount = sum(amount, na.rm = TRUE))

# Rename release methods for better readability
offsite_release_methods <- offsite_release_methods %>%
  mutate(release_method = case_when(
    release_method == "m10" ~ "Storage Only",
    release_method == "m41" ~ "Solidification/Stabilization",
    release_method == "m62" ~ "Wastewater Treatment",
    release_method == "m81" ~ "Landfill/Disposal Surface Impoundment",
    release_method == "m82" ~ "Land Treatment",
    release_method == "m66" ~ "Other Waste Treatment",
    release_method == "m67" ~ "Other Waste Treatment",
    release_method == "m64" ~ "Other Disposal",
    release_method == "m65" ~ "Other Disposal",
    release_method == "m73" ~ "Land Treatment",
    release_method == "m79" ~ "Other Land Disposal",
    release_method == "m90" ~ "Other Off-Site Management",
    release_method == "m94" ~ "Transfer to Waste Broker",
    release_method == "m99" ~ "Unknown",
    TRUE ~ release_method
  ))

# Plot for Different Offsite Release Methods
p5 <- ggplot(offsite_release_methods, aes(x = reorder(release_method, total_amount), y = total_amount, fill = release_method)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Amounts Released by Different Offsite Methods",
       x = "Release Method",
       y = "Total Amount") +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5),
        axis.text.y = element_text(size = 8)) +  # Smaller text for y-axis labels
  scale_y_continuous(labels = comma) +
  scale_fill_brewer(palette = "Set3")

# Calculate proportions for treated, recycled, etc.
proportions <- toxins_named %>%
  summarise(
    total_treated = sum(treatment_on_site + treatment_off_site, na.rm = TRUE),
    total_recycled = sum(recycling_on_site + recycling_off_sit, na.rm = TRUE),
    total_energy_recovery = sum(energy_recover_on + energy_recover_of, na.rm = TRUE),
    total_release = sum(total_releases, na.rm = TRUE)
  ) %>%
  pivot_longer(everything(), names_to = "category", values_to = "amount") %>%
  mutate(proportion = amount / sum(amount),
         category = case_when(
           category == "total_treated" ~ "Treated",
           category == "total_recycled" ~ "Recycled",
           category == "total_energy_recovery" ~ "Energy Recovery",
           category == "total_release" ~ "Released",
           TRUE ~ category
         ))

# Plot for Proportions
p6 <- ggplot(proportions, aes(x = reorder(category, -proportion), y = proportion, fill = category)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = scales::percent(proportion, accuracy = 0.1)), 
            position = position_stack(vjust = 0.5)) +
  labs(title = "Proportion of Toxins Treated, Recycled, Recovered, and Released",
       x = "Category",
       y = "Proportion") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_brewer(palette = "Set2")

# Calculate on-site and off-site releases
onsite_offsite_summary <- toxins_named %>%
  summarise(
    on_site_release = sum(on_site_release_total, na.rm = TRUE),
    off_site_release = sum(off_site_release_total, na.rm = TRUE)
  ) %>%
  pivot_longer(everything(), names_to = "release_type", values_to = "amount") %>%
  mutate(release_type = case_when(
    release_type == "on_site_release" ~ "On-Site Release",
    release_type == "off_site_release" ~ "Off-Site Release",
    TRUE ~ release_type
  ))

# Calculate percentages
total_release <- sum(onsite_offsite_summary$amount)
onsite_offsite_summary <- onsite_offsite_summary %>%
  mutate(percentage = amount / total_release * 100)

# Plot for On-Site vs Off-Site Releases
p7 <- ggplot(onsite_offsite_summary, aes(x = release_type, y = amount, fill = release_type)) +
  geom_bar(stat = "identity", width = 0.6) +
  geom_text(aes(label = paste0(comma(amount), "\n(", round(percentage, 1), "%)")), 
            position = position_stack(vjust = 0.5), 
            size = 4) +
  labs(title = "Comparison of On-Site vs Off-Site Releases",
       x = NULL,  # Remove x-axis label as it's redundant
       y = "Total Amount",
       fill = "Release Type") +
  theme_minimal() +
  theme(legend.position = "none",  # Remove legend as it's redundant
        axis.text.x = element_text(size = 12, face = "bold"),
        plot.title = element_text(hjust = 0.5, size = 16, face = "bold")) +
  scale_y_continuous(labels = comma) +
  scale_fill_brewer(palette = "Set2")

ggplotly(p1)
ggplotly(p2)
ggplotly(p3)
ggplotly(p4)
ggplotly(p5)
ggplotly(p6)
ggplotly(p7)